library(data.table)
library(tidyverse)
library(rmarkdown)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(tximport)
library(Rsubread)
library(DESeq2)
library(UpSetR)
library(ensembldb)
library(gridExtra)
DB <- "EnsDb" # Set your DB of interest
AnnotationSpecies <- "Homo sapiens" # Set your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
# Filter annotation of interest
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T)
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
anno.db <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(anno.db, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(anno.db,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb) # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
## TXID GENEID GENENAME
## 1 ENST00000000233 ENSG00000004059 ARF5
## 2 ENST00000000412 ENSG00000003056 M6PR
## 3 ENST00000000442 ENSG00000173153 ESRRA
## 4 ENST00000001008 ENSG00000004478 FKBP4
## 5 ENST00000001146 ENSG00000003137 CYP26B1
## 6 ENST00000002125 ENSG00000003509 NDUFAF7
# This code chunk needs to be written by yourself
# Define sample names
SampleNames <- c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3))
# Aligner names
Aligners <- c("Salmon", "STAR", "HISAT2")
# Define group level
GroupLevel <- c("Mock", "Covid")
# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)
# Set a function for file paths
path.fn <- function(head, tail) {
vec <- c(paste0("../",
head, # head = e.g. "hisat2", "star", or "salmon"
"_output/",
SampleNames,
tail)) # tail = file name after SampleNames
return(vec)
}
# Define .sf file path
sf <- path.fn("salmon",
".salmon_quant/quant.sf")
# Define STAR file path
star <- path.fn("star",
"Aligned.sortedByCoord.out.bam")
# Define HISAT2 file path
hisat <- path.fn("hisat2",
".sorted.bam")
# Define sample groups
group <- c(rep("Mock", 3), rep("Covid", 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Salmon_path=sf,
STAR_path=star,
HISAT2_path=hisat)
# Assign row names with sample names
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group
## Mock-rep1 Mock-rep1 Mock
## Mock-rep2 Mock-rep2 Mock
## Mock-rep3 Mock-rep3 Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
## Salmon_path
## Mock-rep1 ../salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2 ../salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3 ../salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 ../salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 ../salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 ../salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf
## STAR_path
## Mock-rep1 ../star_output/Mock-rep1Aligned.sortedByCoord.out.bam
## Mock-rep2 ../star_output/Mock-rep2Aligned.sortedByCoord.out.bam
## Mock-rep3 ../star_output/Mock-rep3Aligned.sortedByCoord.out.bam
## SARS-CoV-2-rep1 ../star_output/SARS-CoV-2-rep1Aligned.sortedByCoord.out.bam
## SARS-CoV-2-rep2 ../star_output/SARS-CoV-2-rep2Aligned.sortedByCoord.out.bam
## SARS-CoV-2-rep3 ../star_output/SARS-CoV-2-rep3Aligned.sortedByCoord.out.bam
## HISAT2_path
## Mock-rep1 ../hisat2_output/Mock-rep1.sorted.bam
## Mock-rep2 ../hisat2_output/Mock-rep2.sorted.bam
## Mock-rep3 ../hisat2_output/Mock-rep3.sorted.bam
## SARS-CoV-2-rep1 ../hisat2_output/SARS-CoV-2-rep1.sorted.bam
## SARS-CoV-2-rep2 ../hisat2_output/SARS-CoV-2-rep2.sorted.bam
## SARS-CoV-2-rep3 ../hisat2_output/SARS-CoV-2-rep3.sorted.bam
# "mm10", "mm9", "hg38", or "hg19"
annot.inbuilt <- "hg38"
# GTF file path
annot.ext <- "../../SEQC/reference_GENCODE/gencode.v36.primary_assembly.annotation.gtf"
# annotation type:
# e.g.: "gene_id", "transcript_id", or "gene_name"
GTF.attrType <- "gene_id"
# Number of cores
nthread <- 16
# Set a function importing counts from BAM files with featureCounts()
fcounts.fn <- function(vec) {
fc <- featureCounts(files=vec, # a vector assigning BAM file paths
annot.inbuilt=annot.inbuilt,
annot.ext=annot.ext,
GTF.attrType=GTF.attrType,
isGTFAnnotationFile=T,
nthread=nthread,
isPairedEnd=F, # Set this parameter correctly
verbose=T)
return(fc$counts)
}
# Import gene level summarized counts
salmon.txi <- tximport(metadata$Salmon_path,
type = "salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T,
txOut=F) # TRUE for transcript level, FALSE for gene level
# Extract the counts and save as a data frame
salmon.counts <- salmon.txi$counts
# Explore the salmon count data frame
head(salmon.counts)
## [,1] [,2] [,3] [,4] [,5] [,6]
## ENSG00000000003 8991.942 7660.942 7504.000 11037.869 6193.989 7772.929
## ENSG00000000005 7.000 4.000 0.000 2.000 1.000 4.000
## ENSG00000000419 889.000 928.001 730.000 1211.001 777.000 991.000
## ENSG00000000457 699.297 564.374 566.745 832.783 437.987 596.733
## ENSG00000000460 452.703 366.157 397.262 740.218 388.013 514.268
## ENSG00000000938 0.000 1.000 0.000 0.000 0.000 0.000
dim(salmon.counts)
## [1] 60217 6
summary(salmon.counts)
## V1 V2 V3
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 1.0 Median : 0.0
## Mean : 444.8 Mean : 434.4 Mean : 385.6
## 3rd Qu.: 21.0 3rd Qu.: 20.0 3rd Qu.: 16.0
## Max. :1809942.0 Max. :1898215.0 Max. :1628670.0
## V4 V5 V6
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 0.0 Median : 0.0
## Mean : 562.2 Mean : 308.0 Mean : 427.3
## 3rd Qu.: 26.0 3rd Qu.: 13.8 3rd Qu.: 18.3
## Max. :1416833.0 Max. :775055.0 Max. :1265356.0
# Extract counts by running featureCounts()
star.counts <- fcounts.fn(metadata$STAR_path)
##
## ========== _____ _ _ ____ _____ ______ _____
## ===== / ____| | | | _ \| __ \| ____| /\ | __ \
## ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
## ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
## ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
## ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
## Rsubread 2.4.0
##
## //========================== featureCounts setting ===========================\\
## || ||
## || Input files : 6 BAM files ||
## || ||
## || Mock-rep1Aligned.sortedByCoord.out.bam ||
## || Mock-rep2Aligned.sortedByCoord.out.bam ||
## || Mock-rep3Aligned.sortedByCoord.out.bam ||
## || SARS-CoV-2-rep1Aligned.sortedByCoord.out.bam ||
## || SARS-CoV-2-rep2Aligned.sortedByCoord.out.bam ||
## || SARS-CoV-2-rep3Aligned.sortedByCoord.out.bam ||
## || ||
## || Paired-end : no ||
## || Count read pairs : no ||
## || Annotation : gencode.v36.primary_assembly.annotation.gtf ... ||
## || Dir for temp files : . ||
## || Threads : 16 ||
## || Level : meta-feature level ||
## || Multimapping reads : counted ||
## || Multi-overlapping reads : not counted ||
## || Min overlapping bases : 1 ||
## || ||
## \\============================================================================//
##
## //================================= Running ==================================\\
## || ||
## || Load annotation file gencode.v36.primary_assembly.annotation.gtf ... ||
## || Features : 1430132 ||
## || Meta-features : 60719 ||
## || Chromosomes/contigs : 47 ||
## || ||
## || Process BAM file Mock-rep1Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 54178083 ||
## || Successfully assigned alignments : 29102650 (53.7%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file Mock-rep2Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 52009761 ||
## || Successfully assigned alignments : 28582595 (55.0%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file Mock-rep3Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 44486311 ||
## || Successfully assigned alignments : 25165867 (56.6%) ||
## || Running time : 0.05 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep1Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 65798929 ||
## || Successfully assigned alignments : 35937122 (54.6%) ||
## || Running time : 0.08 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep2Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 34330831 ||
## || Successfully assigned alignments : 19826399 (57.8%) ||
## || Running time : 0.04 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep3Aligned.sortedByCoord.out.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 50803344 ||
## || Successfully assigned alignments : 27376989 (53.9%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Write the final count table. ||
## || Write the read assignment summary. ||
## || ||
## \\============================================================================//
# Explore the STAR count data frame
head(star.counts)
## Mock-rep1Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 24
## ENSG00000227232.5 52
## ENSG00000278267.1 0
## ENSG00000243485.5 8
## ENSG00000284332.1 0
## ENSG00000237613.2 327
## Mock-rep2Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 13
## ENSG00000227232.5 46
## ENSG00000278267.1 0
## ENSG00000243485.5 9
## ENSG00000284332.1 0
## ENSG00000237613.2 311
## Mock-rep3Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 17
## ENSG00000227232.5 32
## ENSG00000278267.1 0
## ENSG00000243485.5 7
## ENSG00000284332.1 0
## ENSG00000237613.2 270
## SARS-CoV-2-rep1Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 29
## ENSG00000227232.5 39
## ENSG00000278267.1 0
## ENSG00000243485.5 5
## ENSG00000284332.1 0
## ENSG00000237613.2 101
## SARS-CoV-2-rep2Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 6
## ENSG00000227232.5 26
## ENSG00000278267.1 0
## ENSG00000243485.5 2
## ENSG00000284332.1 0
## ENSG00000237613.2 97
## SARS-CoV-2-rep3Aligned.sortedByCoord.out.bam
## ENSG00000223972.5 16
## ENSG00000227232.5 45
## ENSG00000278267.1 0
## ENSG00000243485.5 10
## ENSG00000284332.1 0
## ENSG00000237613.2 116
dim(star.counts)
## [1] 60719 6
summary(star.counts)
## Mock-rep1Aligned.sortedByCoord.out.bam Mock-rep2Aligned.sortedByCoord.out.bam
## Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 2.0 Median : 1.0
## Mean : 479.3 Mean : 470.7
## 3rd Qu.: 31.0 3rd Qu.: 29.0
## Max. :2096551.0 Max. :2203523.0
## Mock-rep3Aligned.sortedByCoord.out.bam
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 1.0
## Mean : 414.5
## 3rd Qu.: 23.0
## Max. :1850592.0
## SARS-CoV-2-rep1Aligned.sortedByCoord.out.bam
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 2.0
## Mean : 591.9
## 3rd Qu.: 38.0
## Max. :1464055.0
## SARS-CoV-2-rep2Aligned.sortedByCoord.out.bam
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 1.0
## Mean : 326.5
## 3rd Qu.: 20.0
## Max. :887660.0
## SARS-CoV-2-rep3Aligned.sortedByCoord.out.bam
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 1.0
## Mean : 450.9
## 3rd Qu.: 27.0
## Max. :1410329.0
# Extract counts by running featureCounts()
hisat2.counts <- fcounts.fn(metadata$HISAT2_path)
##
## ========== _____ _ _ ____ _____ ______ _____
## ===== / ____| | | | _ \| __ \| ____| /\ | __ \
## ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
## ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
## ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
## ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
## Rsubread 2.4.0
##
## //========================== featureCounts setting ===========================\\
## || ||
## || Input files : 6 BAM files ||
## || ||
## || Mock-rep1.sorted.bam ||
## || Mock-rep2.sorted.bam ||
## || Mock-rep3.sorted.bam ||
## || SARS-CoV-2-rep1.sorted.bam ||
## || SARS-CoV-2-rep2.sorted.bam ||
## || SARS-CoV-2-rep3.sorted.bam ||
## || ||
## || Paired-end : no ||
## || Count read pairs : no ||
## || Annotation : gencode.v36.primary_assembly.annotation.gtf ... ||
## || Dir for temp files : . ||
## || Threads : 16 ||
## || Level : meta-feature level ||
## || Multimapping reads : counted ||
## || Multi-overlapping reads : not counted ||
## || Min overlapping bases : 1 ||
## || ||
## \\============================================================================//
##
## //================================= Running ==================================\\
## || ||
## || Load annotation file gencode.v36.primary_assembly.annotation.gtf ... ||
## || Features : 1430132 ||
## || Meta-features : 60719 ||
## || Chromosomes/contigs : 47 ||
## || ||
## || Process BAM file Mock-rep1.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 54439819 ||
## || Successfully assigned alignments : 27242652 (50.0%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file Mock-rep2.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 51357583 ||
## || Successfully assigned alignments : 26791035 (52.2%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Process BAM file Mock-rep3.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 44166286 ||
## || Successfully assigned alignments : 23859049 (54.0%) ||
## || Running time : 0.05 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep1.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 68677505 ||
## || Successfully assigned alignments : 33942278 (49.4%) ||
## || Running time : 0.08 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep2.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 35316473 ||
## || Successfully assigned alignments : 18758834 (53.1%) ||
## || Running time : 0.04 minutes ||
## || ||
## || Process BAM file SARS-CoV-2-rep3.sorted.bam... ||
## || ||
## || Chromosomes/contigs in input file but not in annotation ||
## || KI270425.1 ||
## || KI270303.1 ||
## || KI270384.1 ||
## || KI270320.1 ||
## || KI270438.1 ||
## || KI270316.1 ||
## || KI270581.1 ||
## || KI270333.1 ||
## || KI270757.1 ||
## || KI270329.1 ||
## || KI270509.1 ||
## || KI270468.1 ||
## || KI270530.1 ||
## || KI270710.1 ||
## || KI270363.1 ||
## || KI270706.1 ||
## || KI270539.1 ||
## || KI270417.1 ||
## || KI270723.1 ||
## || KI270376.1 ||
## || KI270719.1 ||
## || KI270740.1 ||
## || KI270312.1 ||
## || KI270393.1 ||
## || KI270736.1 ||
## || KI270389.1 ||
## || GL000224.1 ||
## || KI270753.1 ||
## || KI270749.1 ||
## || KI270590.1 ||
## || KI270338.1 ||
## || KI270522.1 ||
## || KI270518.1 ||
## || KI270372.1 ||
## || KI270715.1 ||
## || KI270548.1 ||
## || KI270732.1 ||
## || KI270304.1 ||
## || KI270385.1 ||
## || KI270745.1 ||
## || KI270317.1 ||
## || KI270582.1 ||
## || KI270334.1 ||
## || KI270364.1 ||
## || KI270707.1 ||
## || KI270544.1 ||
## || KI270422.1 ||
## || KI270418.1 ||
## || KI270381.1 ||
## || KI270724.1 ||
## || KI270435.1 ||
## || GL000208.1 ||
## || KI270741.1 ||
## || KI270394.1 ||
## || KI270737.1 ||
## || KI270330.1 ||
## || KI270448.1 ||
## || KI270754.1 ||
## || KI270510.1 ||
## || KI270591.1 ||
## || KI270587.1 ||
## || KI270465.1 ||
## || KI270519.1 ||
## || KI270414.1 ||
## || KI270720.1 ||
## || KI270373.1 ||
## || KI270716.1 ||
## || KI270390.1 ||
## || KI270305.1 ||
## || KI270386.1 ||
## || KI270729.1 ||
## || GL000221.1 ||
## || KI270322.1 ||
## || KI270746.1 ||
## || KI270583.1 ||
## || KI270579.1 ||
## || KI270335.1 ||
## || KI270515.1 ||
## || KI270528.1 ||
## || KI270712.1 ||
## || KI270708.1 ||
## || KI270423.1 ||
## || KI270419.1 ||
## || KI270382.1 ||
## || KI270725.1 ||
## || KI270378.1 ||
## || KI270742.1 ||
## || KI270395.1 ||
## || KI270738.1 ||
## || GL000226.1 ||
## || KI270755.1 ||
## || KI270511.1 ||
## || KI270507.1 ||
## || KI270588.1 ||
## || KI270466.1 ||
## || GL000008.2 ||
## || KI270374.1 ||
## || KI270717.1 ||
## || KI270310.1 ||
## || KI270391.1 ||
## || KI270387.1 ||
## || KI270751.1 ||
## || KI270747.1 ||
## || KI270584.1 ||
## || KI270340.1 ||
## || KI270336.1 ||
## || KI270516.1 ||
## || KI270411.1 ||
## || KI270529.1 ||
## || KI270366.1 ||
## || KI270709.1 ||
## || KI270424.1 ||
## || KI270730.1 ||
## || KI270302.1 ||
## || KI270383.1 ||
## || KI270379.1 ||
## || GL000214.1 ||
## || KI270743.1 ||
## || KI270315.1 ||
## || KI270396.1 ||
## || KI270739.1 ||
## || KI270580.1 ||
## || KI270756.1 ||
## || KI270512.1 ||
## || KI270593.1 ||
## || KI270508.1 ||
## || KI270589.1 ||
## || KI270467.1 ||
## || KI270362.1 ||
## || KI270420.1 ||
## || KI270538.1 ||
## || KI270722.1 ||
## || KI270375.1 ||
## || KI270718.1 ||
## || KI270311.1 ||
## || KI270429.1 ||
## || KI270392.1 ||
## || KI270735.1 ||
## || KI270388.1 ||
## || KI270752.1 ||
## || KI270748.1 ||
## || KI270337.1 ||
## || KI270521.1 ||
## || KI270517.1 ||
## || KI270412.1 ||
## || KI270371.1 ||
## || KI270714.1 ||
## || ||
## || Single-end reads are included. ||
## || Total alignments : 52224139 ||
## || Successfully assigned alignments : 26005814 (49.8%) ||
## || Running time : 0.06 minutes ||
## || ||
## || Write the final count table. ||
## || Write the read assignment summary. ||
## || ||
## \\============================================================================//
# Explore the HISAT2 count data frame
head(hisat2.counts)
## Mock-rep1.sorted.bam Mock-rep2.sorted.bam
## ENSG00000223972.5 20 11
## ENSG00000227232.5 46 62
## ENSG00000278267.1 0 0
## ENSG00000243485.5 6 8
## ENSG00000284332.1 0 0
## ENSG00000237613.2 267 256
## Mock-rep3.sorted.bam SARS-CoV-2-rep1.sorted.bam
## ENSG00000223972.5 12 21
## ENSG00000227232.5 36 57
## ENSG00000278267.1 0 0
## ENSG00000243485.5 4 2
## ENSG00000284332.1 0 0
## ENSG00000237613.2 231 87
## SARS-CoV-2-rep2.sorted.bam SARS-CoV-2-rep3.sorted.bam
## ENSG00000223972.5 4 10
## ENSG00000227232.5 35 57
## ENSG00000278267.1 0 0
## ENSG00000243485.5 2 6
## ENSG00000284332.1 0 0
## ENSG00000237613.2 81 89
dim(hisat2.counts)
## [1] 60719 6
summary(hisat2.counts)
## Mock-rep1.sorted.bam Mock-rep2.sorted.bam Mock-rep3.sorted.bam
## Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 1.0 Median : 1.0 Median : 1.0
## Mean : 448.7 Mean : 441.2 Mean : 392.9
## 3rd Qu.: 27.0 3rd Qu.: 26.0 3rd Qu.: 21.0
## Max. :1876599.0 Max. :2004351.0 Max. :1678512.0
## SARS-CoV-2-rep1.sorted.bam SARS-CoV-2-rep2.sorted.bam
## Min. : 0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 0.0
## Median : 2 Median : 1.0
## Mean : 559 Mean : 308.9
## 3rd Qu.: 33 3rd Qu.: 17.0
## Max. :1410876 Max. :804577.0
## SARS-CoV-2-rep3.sorted.bam
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 1.0
## Mean : 428.3
## 3rd Qu.: 24.0
## Max. :1328490.0
countList <- list(salmon.counts,
star.counts,
hisat2.counts)
# Assign names of the count data frames in the count list
names(countList) <- Aligners
# Set a function cleaning the count data frame
clean.fn <- function(df) {
# Convert to a data frame
df <- as.data.frame(df)
# Assign column names
names(df) <- SampleNames
# Bring row names to a column
df <- df %>% rownames_to_column(var="GENEID")
return(df)
}
# Set a function to drop GENEID version
clean.annotation.fn <- function(df) {
# Re-annotate without version specification
df <- separate(df, "GENEID", c("GENEID", "Version"))
# Remove version column
df <- df[, colnames(df) != "Version"]
return(df)
}
# Move GENEID to a column
for (x in Aligners) {
countList[[x]] <- clean.fn(countList[[x]])
}
# Remove version of GENEID and duplicated rows in STAR & HISAT2 count tables
for (x in Aligners) {
countList[[x]] <- clean.annotation.fn(countList[[x]]) %>%
distinct()
}
# Explore the cleaned count data frames
head(countList[[1]])
## GENEID Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1 ENSG00000000003 8991.942 7660.942 7504.000 11037.869 6193.989
## 2 ENSG00000000005 7.000 4.000 0.000 2.000 1.000
## 3 ENSG00000000419 889.000 928.001 730.000 1211.001 777.000
## 4 ENSG00000000457 699.297 564.374 566.745 832.783 437.987
## 5 ENSG00000000460 452.703 366.157 397.262 740.218 388.013
## 6 ENSG00000000938 0.000 1.000 0.000 0.000 0.000
## SARS-CoV-2-rep3
## 1 7772.929
## 2 4.000
## 3 991.000
## 4 596.733
## 5 514.268
## 6 0.000
head(countList[[2]])
## GENEID Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1 ENSG00000223972 24 13 17 29 6
## 2 ENSG00000227232 52 46 32 39 26
## 3 ENSG00000278267 0 0 0 0 0
## 4 ENSG00000243485 8 9 7 5 2
## 5 ENSG00000284332 0 0 0 0 0
## 6 ENSG00000237613 327 311 270 101 97
## SARS-CoV-2-rep3
## 1 16
## 2 45
## 3 0
## 4 10
## 5 0
## 6 116
head(countList[[3]])
## GENEID Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1 ENSG00000223972 20 11 12 21 4
## 2 ENSG00000227232 46 62 36 57 35
## 3 ENSG00000278267 0 0 0 0 0
## 4 ENSG00000243485 6 8 4 2 2
## 5 ENSG00000284332 0 0 0 0 0
## 6 ENSG00000237613 267 256 231 87 81
## SARS-CoV-2-rep3
## 1 10
## 2 57
## 3 0
## 4 6
## 5 0
## 6 89
dim(countList[[1]])
## [1] 60217 7
dim(countList[[2]])
## [1] 60675 7
dim(countList[[3]])
## [1] 60695 7
sum(duplicated(countList[[1]]))
## [1] 0
sum(duplicated(countList[[2]]))
## [1] 0
sum(duplicated(countList[[3]]))
## [1] 0
# Convert Salmon counts to integers
countList[["Salmon"]] <- cbind(GENEID=countList[["Salmon"]][, "GENEID"],
round(countList[["Salmon"]][,
colnames(countList[["Salmon"]]) %in% SampleNames]))
# Explore the cleaned count data frames
head(countList[[1]])
## GENEID Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1 ENSG00000000003 8992 7661 7504 11038 6194
## 2 ENSG00000000005 7 4 0 2 1
## 3 ENSG00000000419 889 928 730 1211 777
## 4 ENSG00000000457 699 564 567 833 438
## 5 ENSG00000000460 453 366 397 740 388
## 6 ENSG00000000938 0 1 0 0 0
## SARS-CoV-2-rep3
## 1 7773
## 2 4
## 3 991
## 4 597
## 5 514
## 6 0
# Set a function generating a data frame with sequencing depth
seq.depth.fn <- function(df, aligner) {
seqdf <- as.data.frame(colSums(df[, SampleNames])) %>%
rownames_to_column (var="Sample") %>%
mutate(Aligner=aligner)
names(seqdf) <- c("Sample", "Count", "Aligner")
return(seqdf)
}
# Set a function for a bar plot comparing values
comparing.barplot.fn <- function(df, yval, title, ytitle) {
ggplot(df,
aes(x=Sample, y=yval, group=Aligner, fill=Aligner)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1)) +
ggtitle(title) +
ylab(ytitle)
}
# Initialize the seq depth data frame with the first aligner
seq.depth.df <- seq.depth.fn(countList[[1]], Aligners[1])
# Extend the seq depth data frame with the rest of aligners
for (x in Aligners) {
if (x %in% Aligners[2:length(Aligners)]) {
seq.depth.df <- rbind(seq.depth.df,
seq.depth.fn(countList[[x]], x))
}
}
# Explore how the data frame
print(seq.depth.df)
## Sample Count Aligner
## 1 Mock-rep1 26783846 Salmon
## 2 Mock-rep2 26159682 Salmon
## 3 Mock-rep3 23221690 Salmon
## 4 SARS-CoV-2-rep1 33853608 Salmon
## 5 SARS-CoV-2-rep2 18548485 Salmon
## 6 SARS-CoV-2-rep3 25733102 Salmon
## 7 Mock-rep1 29098819 STAR
## 8 Mock-rep2 28578955 STAR
## 9 Mock-rep3 25163540 STAR
## 10 SARS-CoV-2-rep1 35933153 STAR
## 11 SARS-CoV-2-rep2 19824100 STAR
## 12 SARS-CoV-2-rep3 27373805 STAR
## 13 Mock-rep1 27242642 HISAT2
## 14 Mock-rep2 26791029 HISAT2
## 15 Mock-rep3 23859044 HISAT2
## 16 SARS-CoV-2-rep1 33942259 HISAT2
## 17 SARS-CoV-2-rep2 18758823 HISAT2
## 18 SARS-CoV-2-rep3 26005791 HISAT2
summary(seq.depth.df)
## Sample Count Aligner
## Length:18 Min. :18548485 Length:18
## Class :character 1st Qu.:24185168 Class :character
## Mode :character Median :26471764 Mode :character
## Mean :26492910
## 3rd Qu.:28277668
## Max. :35933153
# Convert character vectors to factors
seq.depth.df$Sample <- factor(seq.depth.df$Sample,
levels=SampleNames)
seq.depth.df$Aligner <- factor(seq.depth.df$Aligner,
levels=Aligners)
# Create a plot presenting sequencing depth
comparing.barplot.fn(seq.depth.df,
seq.depth.df$Count,
"Sequencing Depth by Sample and Aligner",
"Count")
# Initialize new lists for storing dds objects
ddsList <- countList
# Initialize new lists for storing vsd objects
vsdList <- countList
for (x in Aligners) {
# Create a count matrix from the count data frame
m <- countList[[x]][, colnames(countList[[x]]) != "GENEID"] %>%
as.matrix()
# Assigne row names
rownames(m) <- countList[[x]]$GENEID
# Generate a DESeq2 object
ddsList[[x]] <- DESeqDataSetFromMatrix(m,
colData=metadata,
design=~Group)
# Conduct vst
vsdList[[x]] <- varianceStabilizingTransformation(ddsList[[x]],
blind=TRUE)
}
# Explore generated objects
summary(ddsList)
## Length Class Mode
## Salmon 60217 DESeqDataSet S4
## STAR 60675 DESeqDataSet S4
## HISAT2 60695 DESeqDataSet S4
summary(vsdList)
## Length Class Mode
## Salmon 60217 DESeqTransform S4
## STAR 60675 DESeqTransform S4
## HISAT2 60695 DESeqTransform S4
# Calculate and add size factors to the DEseq object
for (x in Aligners) {
ddsList[[x]] <- estimateSizeFactors(ddsList[[x]])
}
# Set a function summarizing size factors by aligner to a data frame
sfactor.fn <- function(df, aligner) {
sizefactor <- as.data.frame(round(sizeFactors(df), 3)) %>%
rownames_to_column(var="Sample") %>%
mutate(Aligner=aligner)
names(sizefactor) <- c("Sample", "Size_Factor", "Aligner")
return(sizefactor)
}
# Initialize a data frame with the first aligner
size.factor.df <- sfactor.fn(ddsList[[1]], Aligners[1])
for (x in Aligners) {
if (x != Aligners[1]) {
size.factor.df <- rbind(size.factor.df,
sfactor.fn(ddsList[[x]], x))
}
}
# Explore the data frame
print(size.factor.df)
## Sample Size_Factor Aligner
## 1 Mock-rep1 1.095 Salmon
## 2 Mock-rep2 1.050 Salmon
## 3 Mock-rep3 0.889 Salmon
## 4 SARS-CoV-2-rep1 1.363 Salmon
## 5 SARS-CoV-2-rep2 0.754 Salmon
## 6 SARS-CoV-2-rep3 1.023 Salmon
## 7 Mock-rep1 1.101 STAR
## 8 Mock-rep2 1.056 STAR
## 9 Mock-rep3 0.886 STAR
## 10 SARS-CoV-2-rep1 1.363 STAR
## 11 SARS-CoV-2-rep2 0.747 STAR
## 12 SARS-CoV-2-rep3 1.020 STAR
## 13 Mock-rep1 1.091 HISAT2
## 14 Mock-rep2 1.048 HISAT2
## 15 Mock-rep3 0.894 HISAT2
## 16 SARS-CoV-2-rep1 1.359 HISAT2
## 17 SARS-CoV-2-rep2 0.748 HISAT2
## 18 SARS-CoV-2-rep3 1.025 HISAT2
# Convert character vectors to factors
size.factor.df$Sample <- factor(size.factor.df$Sample,
levels=SampleNames)
size.factor.df$Aligner <- factor(size.factor.df$Aligner,
levels=Aligners)
# Plot calculated size factors
comparing.barplot.fn(size.factor.df,
size.factor.df$Size_Factor,
"Size Factors by Aligner and Sample",
"Size Factor") + geom_hline(yintercept=1, linetype="dashed", color="black", size=1)
for (x in Aligners) {
# Dispersion
ddsList[[x]] <- estimateDispersions(ddsList[[x]])
# Wald test
ddsList[[x]] <- nbinomWaldTest(ddsList[[x]])
}
# Explore generated data in the dds object
ddsList[[1]]
## class: DESeqDataSet
## dim: 60217 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(6): Sample Group ... HISAT2_path sizeFactor
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for sample pca
qcpca.fn <- function(obj, title) {
plotPCA(obj,
intgroup=GroupOfInterest,
returnData=FALSE) + theme_bw() + ggtitle(paste("PCA:", title))
}
# Print the plots
qcpca.fn(vsdList[[1]], Aligners[1])
qcpca.fn(vsdList[[2]], Aligners[2])
qcpca.fn(vsdList[[3]], Aligners[3])
# Heatmap annotation
HeatmapAnno <- metadata[, c("Sample", "Group")]
# Set a function generating a correlation heatmap
cheatmap.fn <- function(df, title) {
# Extract a normalized count matrix
vm <- assay(df)
# Generate a correlation matrix
cm <- cor(vm)
# Generate a heatmap
pheatmap(cm,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:", title))
}
# Print the heatmaps
cheatmap.fn(vsdList[[1]], Aligners[1])
cheatmap.fn(vsdList[[2]], Aligners[2])
cheatmap.fn(vsdList[[3]], Aligners[3])
# Run DESeq
for (x in Aligners) {
ddsList[[x]] <- DESeq(ddsList[[x]])
# Check result names
ResNames <- resultsNames(ddsList[[x]])
print(ResNames)
}
## [1] "Intercept" "Group_Covid_vs_Mock"
## [1] "Intercept" "Group_Covid_vs_Mock"
## [1] "Intercept" "Group_Covid_vs_Mock"
# Set a function plotting dispersion
dplot.fn <- function(dds, title) {
plotDispEsts(dds,
main=paste("Dispersion over Counts:", title))
}
# Plot dispersion patterns
dplot.fn(ddsList[[1]], Aligners[1])
dplot.fn(ddsList[[2]], Aligners[2])
dplot.fn(ddsList[[3]], Aligners[3])
# Do they fit well with the DESeq2 estimation model?
# Set FDR threshold alpha
alpha=0.1
# Set the coefficients to compare
Coef <- ResNames[-1]
print(Coef)
## [1] "Group_Covid_vs_Mock"
# Set a function to clean a result table
lfctable.fn <- function(df) {
df <- df %>%
rownames_to_column(var="GENEID") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
"< 0.1",
"> 0.1"))
return(df)
}
# Set a function extracting results
extract.lfc.fn <- function(dds) {
res <- results(dds, contrast=Contrast, alpha=alpha)
lfctable.fn(as.data.frame(res))
return(lfctable.fn(as.data.frame(res)))
}
# Initialize a list storing lfc data frames
lfcList <- countList
# Extract DE results
# The Contrast variable was defined in the previous chunk
# Extraction with no shrinkage
# alpha: FDR threshold
for (x in Aligners) {
lfcList[[x]] <- extract.lfc.fn(ddsList[[x]]) %>% mutate(Alignment=x)
print(head(lfcList[[x]]))
}
## GENEID baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 7978.6304705 0.001837277 0.1039045 0.01768236 0.98589226
## 2 ENSG00000000005 2.8179041 0.618136403 1.5239193 0.40562280 0.68501977
## 3 ENSG00000000419 900.9751735 -0.196744019 0.1266381 -1.55359294 0.12028154
## 4 ENSG00000000457 598.2936833 0.028644860 0.1403899 0.20403785 0.83832392
## 5 ENSG00000000460 461.5836589 -0.370269993 0.1543873 -2.39831902 0.01647051
## 6 ENSG00000000938 0.1587224 0.974924793 4.0804729 0.23892446 0.81116415
## padj FDR Alignment
## 1 0.99510282 > 0.1 Salmon
## 2 NA > 0.1 Salmon
## 3 0.33184769 > 0.1 Salmon
## 4 0.93983342 > 0.1 Salmon
## 5 0.08055229 < 0.1 Salmon
## 6 NA > 0.1 Salmon
## GENEID baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000223972 16.382284 0.2105319 0.5783491 0.3640221 7.158415e-01
## 2 ENSG00000227232 39.073336 0.2487484 0.3713604 0.6698301 5.029661e-01
## 3 ENSG00000278267 0.000000 NA NA NA NA
## 4 ENSG00000243485 6.640203 0.5409279 0.8752465 0.6180293 5.365560e-01
## 5 ENSG00000284332 0.000000 NA NA NA NA
## 6 ENSG00000237613 202.330477 1.5163697 0.2143040 7.0757887 1.486011e-12
## padj FDR Alignment
## 1 8.876923e-01 > 0.1 STAR
## 2 7.609714e-01 > 0.1 STAR
## 3 NA > 0.1 STAR
## 4 NA > 0.1 STAR
## 5 NA > 0.1 STAR
## 6 1.128834e-10 < 0.1 STAR
## GENEID baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000223972 12.134455 0.42992171 0.6624624 0.64897530 5.163543e-01
## 2 ENSG00000227232 47.652923 -0.02201754 0.3417785 -0.06442051 9.486354e-01
## 3 ENSG00000278267 0.000000 NA NA NA NA
## 4 ENSG00000243485 4.601038 0.83527457 1.0474104 0.79746639 4.251802e-01
## 5 ENSG00000284332 0.000000 NA NA NA NA
## 6 ENSG00000237613 167.762489 1.55049410 0.2207492 7.02378057 2.159439e-12
## padj FDR Alignment
## 1 7.804067e-01 > 0.1 HISAT2
## 2 9.834913e-01 > 0.1 HISAT2
## 3 NA > 0.1 HISAT2
## 4 NA > 0.1 HISAT2
## 5 NA > 0.1 HISAT2
## 6 1.632049e-10 < 0.1 HISAT2
# Initialize a data frame storing total lfc results across the aligners
lfc.dataframe <- lfcList[[1]]
for (x in Aligners[2:length(Aligners)]) {
lfc.dataframe <- rbind(lfc.dataframe,
lfcList[[x]])
}
lfc.dataframe$Alignment <- factor(lfc.dataframe$Alignment,
levels=Aligners)
# Plot distribution of FDR
ggplot(lfc.dataframe,
aes(x=padj, y=..count.., color=Alignment)) +
geom_density(size=1) +
theme_bw() +
scale_x_log10() +
ggtitle("Distribution of False Discovery Rate (FDR) by Aligner") +
ylab("Count") +
xlim(0.00001, 1) +
geom_vline(xintercept=alpha,
color="black",
size=1, linetype="dashed") + scale_x_continuous(breaks=seq(0, 1, by=0.1))
valid.lfc.df <- subset(lfc.dataframe, FDR == paste("<", alpha))
ggplot(valid.lfc.df,
aes(x=log2FoldChange,
y=..count..,
color=Alignment)) +
geom_density(size=1) +
theme_bw() +
geom_vline(xintercept=c(-1, 1),
linetype="dashed", color="black", size=1) +
ggtitle(paste0("Distribution of log2FoldChange Values by Aligner (FDR < ", alpha, ")")) +
ylab("Count") +
xlim(-10, 10) # Change xlim by datatype
# Set ylim: has to adjusted by users depending on data
yl <- c(-12, 12)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Create MA plots by Aligner
ggplot(lfc.dataframe, aes(x=baseMean, y=log2FoldChange, color=FDR)) +
geom_point() +
facet_grid(~Alignment) +
scale_x_log10() +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot")) +
ylim(yl[1], yl[2]) +
theme(strip.text.x=element_text(size=10)) +
geom_hline(yintercept=c(mLog[1], mLog[2]),
linetype="dashed", color="red")
# Initialize a list
heatmap.df.List <- lfcList
# Filter genes with FDR < alpha and absolute log2FoldChange > 1
for (x in Aligners) {
# Set a logical vector filtering FDR below alpha
is.fdr.valid <- lfcList[[x]]$FDR == paste("<", alpha)
# Set a logical vector filtering absolute lfc above 1
is.lfc.large <- abs(lfcList[[x]]$log2FoldChange) > mLog[2]
# Extract total normalized counts
norm.counts <- counts(ddsList[[x]], normalized=T)
# Save filtered genes only from the normalized count data
heatmap.df.List[[x]] <- norm.counts[is.fdr.valid & is.lfc.large,]
}
# Explore the cleaned data frames
head(heatmap.df.List[[1]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000002746 42.919753 35.23637 27.006584 82.930962 55.73656
## ENSG00000004399 6.392304 12.38035 9.002195 35.961214 26.54122
## ENSG00000005884 61.183477 78.09141 82.145025 330.256045 281.33690
## ENSG00000006327 4.565931 11.42801 15.753840 64.583404 71.66129
## ENSG00000006747 627.358939 690.44234 697.670075 1453.860501 1601.76243
## ENSG00000007001 39.267008 32.37936 34.883504 8.806828 11.94355
## SARS-CoV-2-rep3
## ENSG00000002746 75.28518
## ENSG00000004399 35.19827
## ENSG00000005884 185.76863
## ENSG00000006327 46.93102
## ENSG00000006747 2004.34571
## ENSG00000007001 17.59913
head(heatmap.df.List[[2]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000237613 296.941593 294.510344 304.909944 74.09817
## ENSG00000177133 170.718714 143.940747 146.808491 33.01404
## ENSG00000097021 34.506974 35.038208 35.008179 129.85521
## ENSG00000049249 1.816157 14.204679 1.129296 96.10753
## ENSG00000284747 34.506974 40.720080 27.103106 172.40663
## ENSG00000284716 2.724235 5.681872 2.258592 13.93926
## SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000237613 129.79225 113.730562
## ENSG00000177133 48.17032 67.650075
## ENSG00000097021 149.86322 86.278357
## ENSG00000049249 104.36903 26.471769
## ENSG00000284747 148.52515 125.495792
## ENSG00000284716 21.40903 7.843487
head(heatmap.df.List[[3]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000237613 244.74613 244.27485 258.489470 64.00598 108.24497
## ENSG00000177133 163.16409 135.49620 139.875254 32.37084 45.43616
## ENSG00000158286 58.66574 33.39695 40.284073 24.27813 26.72715
## ENSG00000097021 32.08283 32.44275 29.094053 125.80486 134.97212
## ENSG00000049249 0.00000 14.31298 2.238004 93.43402 101.56318
## ENSG00000284747 32.99948 39.12214 22.380041 165.53271 144.32663
## SARS-CoV-2-rep3
## ENSG00000237613 86.81353
## ENSG00000177133 63.40314
## ENSG00000158286 13.65606
## ENSG00000097021 79.98550
## ENSG00000049249 21.45953
## ENSG00000284747 119.97825
dim(heatmap.df.List[[1]])
## [1] 1084 6
dim(heatmap.df.List[[2]])
## [1] 1244 6
dim(heatmap.df.List[[3]])
## [1] 1240 6
pheatmap(heatmap.df.List[[3]],
annotation=HeatmapAnno,
scale="row",
show_rownames=F)
# Set a function creating a profiling heatmap
profile.heatmap.fn <- function(df, title) {
pheatmap(df,
annotation=HeatmapAnno,
scale="row",
show_rownames=F,
main=paste("Expression Profiling by", title, "(FDR < 0.1, absolute log2FoldChange > 1)"))
}
# Print the expression heatmaps
profile.heatmap.fn(heatmap.df.List[[Aligners[1]]], Aligners[1])
profile.heatmap.fn(heatmap.df.List[[Aligners[2]]], Aligners[2])
profile.heatmap.fn(heatmap.df.List[[Aligners[3]]], Aligners[3])
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing number of NA genes by type
NA.genes <- lfc.dataframe %>%
group_by(Alignment) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Alignment) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NA.genes,
aes(x=Type, y=Number, group=Alignment, fill=Alignment, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Create an empty list storing significant gene lfc tables
sigList <- list()
# Filter significant genes' lfc and save in the list
for (x in Aligners) {
sigList[[x]] <- subset(lfcList[[x]], FDR == paste("<", alpha))
}
# Create a vector storing column names
c_names <- colnames(sigList[[1]])[-1]
c_names <- c("GENEID",
paste0(c_names, "_", Aligners[1]),
paste0(c_names, "_", Aligners[2]),
paste0(c_names, "_", Aligners[3]))
# Join tables by GENEID
lfcTable <- sigList[[1]] %>%
inner_join(sigList[[2]], by="GENEID") %>%
inner_join(sigList[[3]], by="GENEID")
# Rename the columns
names(lfcTable) <- c_names
# Calculate differences
lfcTable <- lfcTable %>%
mutate(mean_SA_ST=baseMean_Salmon - baseMean_STAR,
mean_SA_HI=baseMean_Salmon - baseMean_HISAT2,
mean_ST_HI=baseMean_STAR - baseMean_HISAT2,
lfc_SA_ST=log2FoldChange_Salmon - log2FoldChange_STAR,
lfc_SA_HI=log2FoldChange_Salmon - log2FoldChange_HISAT2,
lfc_ST_HI=log2FoldChange_STAR - log2FoldChange_HISAT2,
FDR_SA_ST=padj_Salmon - padj_STAR,
FDR_SA_HI=padj_Salmon - padj_HISAT2,
FDR_ST_HI=padj_STAR - padj_HISAT2) %>%
left_join(AnnoDb.ntrans, by="GENEID")
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {
vec <- c()
for (i in 1:num) {
vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
}
return(vec)
}
# Explore the output table
head(lfcTable)
## GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSG00000001084 1587.7857 0.5853638 0.1350708
## 2 ENSG00000001497 685.4758 -0.3467204 0.1440032
## 3 ENSG00000001561 2240.0327 0.6073274 0.1311473
## 4 ENSG00000002746 53.1859 -1.0269975 0.3628482
## 5 ENSG00000002834 1075.5410 -0.6594280 0.1275030
## 6 ENSG00000003393 2980.8300 -0.3125891 0.1081754
## stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1 4.333754 1.465878e-05 2.334551e-04 < 0.1 Salmon
## 2 -2.407727 1.605219e-02 7.910015e-02 < 0.1 Salmon
## 3 4.630878 3.641179e-06 6.871668e-05 < 0.1 Salmon
## 4 -2.830378 4.649307e-03 3.015041e-02 < 0.1 Salmon
## 5 -5.171864 2.317703e-07 6.121000e-06 < 0.1 Salmon
## 6 -2.889651 3.856695e-03 2.593355e-02 < 0.1 Salmon
## baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR pvalue_STAR
## 1 1579.75977 0.5664292 0.1342064 4.220584 2.436705e-05
## 2 726.26711 -0.3417999 0.1371200 -2.492707 1.267735e-02
## 3 2273.37550 0.5989872 0.1297204 4.617524 3.883459e-06
## 4 58.72536 -1.0444676 0.3265005 -3.198977 1.379162e-03
## 5 932.70171 -0.6567112 0.1267828 -5.179814 2.221074e-07
## 6 3143.99551 -0.3221310 0.1063918 -3.027779 2.463580e-03
## padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 3.973616e-04 < 0.1 STAR 1519.05864 0.5833814
## 2 6.833087e-02 < 0.1 STAR 687.30008 -0.3278163
## 3 7.878088e-05 < 0.1 STAR 2236.78012 0.5980731
## 4 1.189267e-02 < 0.1 STAR 57.00722 -1.0240761
## 5 6.212536e-06 < 0.1 STAR 895.53964 -0.6698121
## 6 1.904315e-02 < 0.1 STAR 3047.53624 -0.3097336
## lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1 0.1317808 4.426908 9.559346e-06 1.804589e-04 < 0.1
## 2 0.1383529 -2.369421 1.781595e-02 9.157123e-02 < 0.1
## 3 0.1303310 4.588880 4.456302e-06 9.161883e-05 < 0.1
## 4 0.3277511 -3.124554 1.780749e-03 1.523395e-02 < 0.1
## 5 0.1266770 -5.287558 1.239603e-07 3.828322e-06 < 0.1
## 6 0.1062535 -2.915043 3.556398e-03 2.650504e-02 < 0.1
## Alignment_HISAT2 mean_SA_ST mean_SA_HI mean_ST_HI lfc_SA_ST lfc_SA_HI
## 1 HISAT2 8.025889 68.727024 60.70114 0.018934584 0.001982376
## 2 HISAT2 -40.791314 -1.824283 38.96703 -0.004920541 -0.018904180
## 3 HISAT2 -33.342767 3.252613 36.59538 0.008340129 0.009254245
## 4 HISAT2 -5.539464 -3.821324 1.71814 0.017470176 -0.002921357
## 5 HISAT2 142.839305 180.001381 37.16208 -0.002716743 0.010384170
## 6 HISAT2 -163.165542 -66.706268 96.45927 0.009541858 -0.002855531
## lfc_ST_HI FDR_SA_ST FDR_SA_HI FDR_ST_HI num.trans
## 1 -0.0169522076 -1.639066e-04 5.299613e-05 2.169027e-04 14
## 2 -0.0139836398 1.076927e-02 -1.247108e-02 -2.324035e-02 25
## 3 0.0009141155 -1.006420e-05 -2.290216e-05 -1.283796e-05 1
## 4 -0.0203915329 1.825774e-02 1.491646e-02 -3.341288e-03 11
## 5 0.0131009125 -9.153651e-08 2.292678e-06 2.384215e-06 9
## 6 -0.0123973888 6.890397e-03 -5.714961e-04 -7.461893e-03 75
dim(lfcTable)
## [1] 3590 35
my.param <- c("baseMean", "log2FoldChange", "padj")
# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>%
dplyr::select(GENEID, num.trans, starts_with(my.param)) %>%
gather(Category, Value, -GENEID, -num.trans) %>%
separate(Category, c("Metric", "Aligner"), sep="_") %>% pivot_wider(names_from=Aligner, values_from=Value) %>%
group_by(Metric) %>%
nest() %>%
# Calculate correlation coefficients between aligners by metric
mutate(salmon.star.corr=map_dbl(data, ~ cor(.x$Salmon, .x$STAR)),
salmon.hisat.corr=map_dbl(data, ~ cor(.x$Salmon, .x$HISAT2)),
star.hisat.corr=map_dbl(data, ~ cor(.x$STAR, .x$HISAT2)))
# Explore the cleaned data frame
lfcTable.comp
## # A tibble: 3 x 5
## # Groups: Metric [3]
## Metric data salmon.star.corr salmon.hisat.co… star.hisat.corr
## <chr> <list> <dbl> <dbl> <dbl>
## 1 baseMean <tibble [3,590… 0.994 0.994 1.00
## 2 log2FoldCha… <tibble [3,590… 0.994 0.995 0.999
## 3 padj <tibble [3,590… 0.863 0.870 0.964
lfcTable.comp$data
## [[1]]
## # A tibble: 3,590 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSG00000001084 14 1588. 1580. 1519.
## 2 ENSG00000001497 25 685. 726. 687.
## 3 ENSG00000001561 1 2240. 2273. 2237.
## 4 ENSG00000002746 11 53.2 58.7 57.0
## 5 ENSG00000002834 9 1076. 933. 896.
## 6 ENSG00000003393 75 2981. 3144. 3048.
## 7 ENSG00000003756 26 7190. 5578. 5374.
## 8 ENSG00000004399 17 20.9 23.8 21.9
## 9 ENSG00000004700 8 2989. 1644. 1593.
## 10 ENSG00000004799 5 105. 108. 107.
## # … with 3,580 more rows
##
## [[2]]
## # A tibble: 3,590 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSG00000001084 14 0.585 0.566 0.583
## 2 ENSG00000001497 25 -0.347 -0.342 -0.328
## 3 ENSG00000001561 1 0.607 0.599 0.598
## 4 ENSG00000002746 11 -1.03 -1.04 -1.02
## 5 ENSG00000002834 9 -0.659 -0.657 -0.670
## 6 ENSG00000003393 75 -0.313 -0.322 -0.310
## 7 ENSG00000003756 26 0.269 0.258 0.261
## 8 ENSG00000004399 17 -1.82 -2.00 -1.85
## 9 ENSG00000004700 8 -0.394 -0.405 -0.393
## 10 ENSG00000004799 5 0.884 0.886 0.906
## # … with 3,580 more rows
##
## [[3]]
## # A tibble: 3,590 x 5
## GENEID num.trans Salmon STAR HISAT2
## <chr> <int> <dbl> <dbl> <dbl>
## 1 ENSG00000001084 14 0.000233 0.000397 0.000180
## 2 ENSG00000001497 25 0.0791 0.0683 0.0916
## 3 ENSG00000001561 1 0.0000687 0.0000788 0.0000916
## 4 ENSG00000002746 11 0.0302 0.0119 0.0152
## 5 ENSG00000002834 9 0.00000612 0.00000621 0.00000383
## 6 ENSG00000003393 75 0.0259 0.0190 0.0265
## 7 ENSG00000003756 26 0.0606 0.0795 0.0754
## 8 ENSG00000004399 17 0.0110 0.00102 0.00441
## 9 ENSG00000004700 8 0.0176 0.0161 0.0211
## 10 ENSG00000004799 5 0.0175 0.0102 0.00881
## # … with 3,580 more rows
# Prep vectors storing the correlation coefficient without duplication by comparison
Rsquared.salmon.star <- plotlabels.fn(lfcTable.comp$salmon.star.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
Rsquared.salmon.hisat <- plotlabels.fn(lfcTable.comp$salmon.hisat.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
Rsquared.star.hisat <- plotlabels.fn(lfcTable.comp$star.hisat.corr,
lfcTable.comp$data,
nrow(lfcTable.comp))
# Unnest the data frame and add columns storing correlation coefficients each
lfcTable.comp <- lfcTable.comp %>%
unnest(data)
head(lfcTable.comp)
## # A tibble: 6 x 9
## # Groups: Metric [1]
## Metric GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 baseM… ENSG0… 14 1588. 1580. 1519. 0.994 0.994
## 2 baseM… ENSG0… 25 685. 726. 687. 0.994 0.994
## 3 baseM… ENSG0… 1 2240. 2273. 2237. 0.994 0.994
## 4 baseM… ENSG0… 11 53.2 58.7 57.0 0.994 0.994
## 5 baseM… ENSG0… 9 1076. 933. 896. 0.994 0.994
## 6 baseM… ENSG0… 75 2981. 3144. 3048. 0.994 0.994
## # … with 1 more variable: star.hisat.corr <dbl>
lfcTable.comp$Rsquared.salmon.star <- Rsquared.salmon.star
lfcTable.comp$Rsquared.salmon.hisat <- Rsquared.salmon.hisat
lfcTable.comp$Rsquared.star.hisat <- Rsquared.star.hisat
# Explore the data frame
head(lfcTable.comp)
## # A tibble: 6 x 12
## # Groups: Metric [1]
## Metric GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 baseM… ENSG0… 14 1588. 1580. 1519. 0.994 0.994
## 2 baseM… ENSG0… 25 685. 726. 687. 0.994 0.994
## 3 baseM… ENSG0… 1 2240. 2273. 2237. 0.994 0.994
## 4 baseM… ENSG0… 11 53.2 58.7 57.0 0.994 0.994
## 5 baseM… ENSG0… 9 1076. 933. 896. 0.994 0.994
## 6 baseM… ENSG0… 75 2981. 3144. 3048. 0.994 0.994
## # … with 4 more variables: star.hisat.corr <dbl>, Rsquared.salmon.star <chr>,
## # Rsquared.salmon.hisat <chr>, Rsquared.star.hisat <chr>
# Nest the data frame by Metric
lfcTable.comp <- lfcTable.comp %>%
group_by(Metric) %>%
nest()
# Explore the data frame
lfcTable.comp
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [3,590 × 11]>
## 2 log2FoldChange <tibble [3,590 × 11]>
## 3 padj <tibble [3,590 × 11]>
lfcTable.comp$data
## [[1]]
## # A tibble: 3,590 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG0000000… 14 1588. 1580. 1519. 0.994 0.994
## 2 ENSG0000000… 25 685. 726. 687. 0.994 0.994
## 3 ENSG0000000… 1 2240. 2273. 2237. 0.994 0.994
## 4 ENSG0000000… 11 53.2 58.7 57.0 0.994 0.994
## 5 ENSG0000000… 9 1076. 933. 896. 0.994 0.994
## 6 ENSG0000000… 75 2981. 3144. 3048. 0.994 0.994
## 7 ENSG0000000… 26 7190. 5578. 5374. 0.994 0.994
## 8 ENSG0000000… 17 20.9 23.8 21.9 0.994 0.994
## 9 ENSG0000000… 8 2989. 1644. 1593. 0.994 0.994
## 10 ENSG0000000… 5 105. 108. 107. 0.994 0.994
## # … with 3,580 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
##
## [[2]]
## # A tibble: 3,590 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG0000000… 14 0.585 0.566 0.583 0.994 0.995
## 2 ENSG0000000… 25 -0.347 -0.342 -0.328 0.994 0.995
## 3 ENSG0000000… 1 0.607 0.599 0.598 0.994 0.995
## 4 ENSG0000000… 11 -1.03 -1.04 -1.02 0.994 0.995
## 5 ENSG0000000… 9 -0.659 -0.657 -0.670 0.994 0.995
## 6 ENSG0000000… 75 -0.313 -0.322 -0.310 0.994 0.995
## 7 ENSG0000000… 26 0.269 0.258 0.261 0.994 0.995
## 8 ENSG0000000… 17 -1.82 -2.00 -1.85 0.994 0.995
## 9 ENSG0000000… 8 -0.394 -0.405 -0.393 0.994 0.995
## 10 ENSG0000000… 5 0.884 0.886 0.906 0.994 0.995
## # … with 3,580 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
##
## [[3]]
## # A tibble: 3,590 x 11
## GENEID num.trans Salmon STAR HISAT2 salmon.star.corr salmon.hisat.co…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG000… 14 2.33e-4 3.97e-4 1.80e-4 0.863 0.870
## 2 ENSG000… 25 7.91e-2 6.83e-2 9.16e-2 0.863 0.870
## 3 ENSG000… 1 6.87e-5 7.88e-5 9.16e-5 0.863 0.870
## 4 ENSG000… 11 3.02e-2 1.19e-2 1.52e-2 0.863 0.870
## 5 ENSG000… 9 6.12e-6 6.21e-6 3.83e-6 0.863 0.870
## 6 ENSG000… 75 2.59e-2 1.90e-2 2.65e-2 0.863 0.870
## 7 ENSG000… 26 6.06e-2 7.95e-2 7.54e-2 0.863 0.870
## 8 ENSG000… 17 1.10e-2 1.02e-3 4.41e-3 0.863 0.870
## 9 ENSG000… 8 1.76e-2 1.61e-2 2.11e-2 0.863 0.870
## 10 ENSG000… 5 1.75e-2 1.02e-2 8.81e-3 0.863 0.870
## # … with 3,580 more rows, and 4 more variables: star.hisat.corr <dbl>,
## # Rsquared.salmon.star <chr>, Rsquared.salmon.hisat <chr>,
## # Rsquared.star.hisat <chr>
# Set a function creating a scatter plot
comp.scatter.fn <- function(df,
xvar,
yvar,
label,
metric,
xlab, ylab,
xlog=F,
ylog=F) {
p <- ggplot(df, aes(x=xvar, y=yvar, color=log(num.trans), label=label)) +
geom_point(alpha=0.5) +
theme_bw() +
geom_text(size=5,
mapping=aes(x=Inf, y=Inf),
vjust=2, hjust=1.2, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") +
ggtitle(paste("Comparison in", metric, "\n(with R-Squared)")) + scale_color_gradient(low="blue", high="red") +
xlab(xlab) +
ylab(ylab)
if (xlog) {
p <- p +
scale_x_log10()
}
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Create and add the scatter plots with or without logarithmic transformation of
# x/y scales
lfcTable.comp <- lfcTable.comp %>%
# Add scatter plots to the data frame:
#
# without log transformation of x and y scales
mutate(salmon.star=map(data,
~ comp.scatter.fn(.x, .x$Salmon, .x$STAR,
.x$Rsquared.salmon.star, Metric,
"Salmon", "STAR")),
salmon.hisat=map(data, ~ comp.scatter.fn(.x,
.x$Salmon,
.x$HISAT2,
.x$Rsquared.salmon.hisat,
Metric,
"Salmon", "HISAT2")),
star.hisat=map(data, ~ comp.scatter.fn(.x,
.x$STAR,
.x$HISAT2,
.x$Rsquared.star.hisat,
Metric,
"STAR", "HISAT2")),
# with log transformation of x and y scales
salmon.star.log=map(data,
~ comp.scatter.fn(.x, .x$Salmon, .x$STAR,
.x$Rsquared.salmon.star, Metric,
"Salmon", "STAR",
T, T)),
salmon.hisat.log=map(data, ~ comp.scatter.fn(.x,
.x$Salmon,
.x$HISAT2,
.x$Rsquared.salmon.hisat,
Metric,
"Salmon", "HISAT2",
T, T)),
star.hisat.log=map(data, ~ comp.scatter.fn(.x,
.x$STAR,
.x$HISAT2,
.x$Rsquared.star.hisat,
Metric,
"STAR", "HISAT2",
T, T)))
# Set a function simplifying grid.arrange() function
gr.ar.fn0 <- function(pl1, pl2, pl3) {
grid.arrange(pl1, pl2, pl3, ncol=1)
}
# Print the plots
for (i in 1:length(my.param)) {
p1 <- gr.ar.fn0(lfcTable.comp$salmon.star[[i]],
lfcTable.comp$salmon.hisat[[i]],
lfcTable.comp$star.hisat[[i]])
p2 <- gr.ar.fn0(lfcTable.comp$salmon.star.log[[i]],
lfcTable.comp$salmon.hisat.log[[i]],
lfcTable.comp$star.hisat.log[[i]])
print(grid.arrange(p1, p2, nrow=1))
}
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (1-1,2-2) arrange gtable[arrange]
# Slice the data frame
lfcTable.rel <- lfcTable %>%
dplyr::select(starts_with(c(my.param, "num.")))
# Set a function cleaning my data frame
gather.fn <- function(word) {
gather(lfcTable.rel, Group, Value, starts_with(word)) %>%
dplyr::select(Group, Value, num.trans)
}
# Set a function creating a boxplot
boxplot1.fn <- function(metric, ylog=F) {
df <- subset(lfcTable.rela, Metric == metric)
df$Aligner <- factor(df$Aligner, levels=Aligners)
p <- ggplot(df,
aes(x= Aligner, y=Value, fill=Aligner, label=ANOVA.pval)) +
geom_boxplot(alpha=0.5) +
theme_bw() +
xlab("Aligner") +
ylab("Value") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste("Aligner Comparison in",
metric,
"\n(P-value from Two-Tailed ANOVA Test)")) + ylab(metric) +
geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Clean the data frame
lfcTable.rela <- rbind(gather.fn("base"),
gather.fn("log2"),
gather.fn("padj")) %>%
separate(Group, c("Metric", "Aligner"), sep="_") %>%
# Nest by Metric
group_by(Metric) %>%
nest() %>%
# Add columns storing:
# 1. linear regression model
# 2. summary of the model
# 3. R-squared extracted from the model
# 4. ANOVA test
# 5. p-values extracted from the ANOVA objects
mutate(Model=map(data, ~ lm(Value ~ num.trans, data=.x)),
Model_Summary=map(Model, ~ summary(.x)),
Rsquared=map_dbl(Model_Summary, ~ .x$r.squared),
ANOVA=map(data, ~ unlist(summary(aov(Value ~ Aligner, data=.x)))),
ANOVA.pval=map_dbl(ANOVA, ~ round(.x["Pr(>F)1"], 9)))
# Create a vector storing p-values which will be used in plotting
ANOVA.pval <- plotlabels.fn(lfcTable.rela$ANOVA.pval,
lfcTable.rela$data,
nrow(lfcTable.rela))
# Unnest
lfcTable.rela <- lfcTable.rela %>%
unnest(data)
# Add a column storing previously created p-value vector
lfcTable.rela$ANOVA.pval <- ANOVA.pval
# Plot
boxplot1.fn("baseMean", T)
boxplot1.fn("log2FoldChange", T)
boxplot1.fn("padj", F)
# Assign names of the list
col.of.int <- c("Subtraction", "Difference")
# Slice and clean the data frame
lfcTable.diff <- dplyr::select(lfcTable,
starts_with(c("mean_",
"lfc_",
"FDR_"))) %>%
dplyr::select(-ends_with(c("Salmon", "STAR", "HISAT2")))
# Create an empty data frame
lfcTable.sub <- data.frame()
# Clean and save in the data frame
for (x in c("mean_", "lfc_", "FDR_")) {
df <- lfcTable.diff %>%
gather(Subtraction, Difference, starts_with(x)) %>%
dplyr::select(col.of.int)
if (x == "mean_") {
lfcTable.sub <- df
} else {
lfcTable.sub <- rbind(lfcTable.sub, df)
}
}
# Explore the output
head(lfcTable.sub)
## Subtraction Difference
## 1 mean_SA_ST 8.025889
## 2 mean_SA_ST -40.791314
## 3 mean_SA_ST -33.342767
## 4 mean_SA_ST -5.539464
## 5 mean_SA_ST 142.839305
## 6 mean_SA_ST -163.165542
# Rearrange the data frame
lfcTable.sub <- lfcTable.sub %>%
separate(Subtraction, c("Metric", "From", "To"), sep="_") %>%
mutate(Metric=ifelse(Metric == "mean", "baseMean",
ifelse(Metric == "lfc", "log2FoldChange", "padj")),
From=ifelse(From == "SA", "Salmon",
ifelse(From == "ST", "STAR", "HISAT2")),
To=ifelse(To == "SA", "Salmon",
ifelse(To == "ST", "STAR", "HISAT2")),
Subtraction=paste(From, "-", To)) %>%
# Nest by Metric
group_by(Metric) %>%
nest()
# Conduct ANOVA test in the subtraction data
lfcTable.sub <- lfcTable.sub %>%
mutate(ANOVA=map(data, ~aov(Difference ~ Subtraction, data=.x)),
ANOVA.summary=map(ANOVA, ~ unlist(summary(.x))),
ANOVA.pval=map_dbl(ANOVA.summary, ~ .x["Pr(>F)1"]))
# Create a vector storing pvalue labels
sub.ANOVA.pval <- plotlabels.fn(lfcTable.sub$ANOVA.pval,
lfcTable.sub$data,
nrow(lfcTable.sub))
# Unnest the data frame
lfcTable.sub <- lfcTable.sub %>%
unnest(data)
# Add a column storing pvalues
lfcTable.sub$ANOVA.pval <- sub.ANOVA.pval
# Set a function creating a box plot
boxplot2.fn <- function(metric, ylog=F) {
df <- subset(lfcTable.sub, Metric == metric)
p <- ggplot(df,
aes(x= Subtraction, y=abs(Difference),
fill=Subtraction, label=ANOVA.pval)) +
geom_boxplot(alpha=0.5) +
theme_bw() +
xlab("Subtraction") +
ylab("Difference") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste("Subtraction Comparison in",
metric,
"\n(P-value from Two-Tailed ANOVA Test)")) +
geom_text(size=5, mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.2)
if (ylog) {
p <- p + scale_y_log10()
}
return(p)
}
# Print the boxplots comparing subtraction & statistical significance by metric
boxplot2.fn("baseMean", T)
boxplot2.fn("log2FoldChange", T)
boxplot2.fn("padj", F)
# Create an empty data frame
lfcTable.sd <- data.frame()
for (x in my.param) {
# Slice the data frame with columns of interest
df <- dplyr::select(lfcTable,
starts_with(x))
# Calculate Mean across the aligners
df$Mean <- rowMeans(df)
# Calculate SD across the aligners
df$SD <- apply(df[, 1:3], 1, sd)
# Add a column storing the number of alternative transcripts
df$num.trans <- lfcTable$num.trans
# Change column names
colnames(df) <- str_replace(colnames(df), paste0(x, "_"), "")
# Add a column storing metric
df$Metric <- x
# Save as a data frame
if (x == my.param[1]) {
lfcTable.sd <- df
} else {
lfcTable.sd <- rbind(lfcTable.sd, df)
}
}
# Explore the output data frame
head(lfcTable.sd)
## Salmon STAR HISAT2 Mean SD num.trans Metric
## 1 1587.7857 1579.75977 1519.05864 1562.20136 37.577579 14 baseMean
## 2 685.4758 726.26711 687.30008 699.68100 23.042312 25 baseMean
## 3 2240.0327 2273.37550 2236.78012 2250.06278 20.254799 1 baseMean
## 4 53.1859 58.72536 57.00722 56.30616 2.835495 11 baseMean
## 5 1075.5410 932.70171 895.53964 967.92746 95.030332 9 baseMean
## 6 2980.8300 3143.99551 3047.53624 3057.45390 82.033643 75 baseMean
dim(lfcTable.sd)
## [1] 10770 7
# Nest the data frame by Metric
lfcTable.sd <- lfcTable.sd %>%
group_by(Metric) %>%
nest()
# Explore the output data frame
lfcTable.sd
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [3,590 × 6]>
## 2 log2FoldChange <tibble [3,590 × 6]>
## 3 padj <tibble [3,590 × 6]>
# Set a function creating a scatter plot demonstrating SD over the number of alternative transcripts
scatter1.fn <- function(df, mlog=F, ylog=F, metric) {
if (mlog) {
p <- ggplot(df, aes(x=num.trans, y=SD, color=log(Mean)))
} else {
p <- ggplot(df, aes(x=num.trans, y=SD, color=Mean))
}
p <- p +
geom_point(alpha=0.5) +
theme_bw() +
xlab("Number of Transcripts") +
ggtitle(paste("Relationship between",
metric,
"Standard Deviation and\nNumber of Transcripts")) +
scale_color_gradient(low="blue", high="red")
if (ylog) {
p <- p +
scale_y_log10() +
ylab("Log (Standard Deviation)")
} else {
p <- p + ylab("Standard Deviation")
}
return(p)
}
# Create and save the scatter plots
lfcTable.sd <- lfcTable.sd %>%
# sd.plot = No log transformation in the plot
# sd.plot.logMean = log(Mean)
# sd.plot.logY = log(SD),
# sd.plot.logBoth = log(Mean) & log(SD)
mutate(sd.plot=map(data, ~ scatter1.fn(.x, F, F, Metric)),
sd.plot.logMean=map(data, ~ scatter1.fn(.x, T, F, Metric)),
sd.plot.logY=map(data, ~ scatter1.fn(.x, F, T, Metric)),
sd.plot.logBoth=map(data, ~ scatter1.fn(.x, T, T, Metric)))
# Set a function shortening scripts
gr.ar.fn2 <- function(pl1, pl2, pl3) {
grid.arrange(pl1, pl2, pl3, nrow=1)
}
# Explore the plots
gr.ar.fn2(lfcTable.sd$sd.plot[[1]],
lfcTable.sd$sd.plot[[2]],
lfcTable.sd$sd.plot[[3]])
gr.ar.fn2(lfcTable.sd$sd.plot.logMean[[1]],
lfcTable.sd$sd.plot.logMean[[2]],
lfcTable.sd$sd.plot.logMean[[3]])
gr.ar.fn2(lfcTable.sd$sd.plot.logY[[1]],
lfcTable.sd$sd.plot.logY[[2]],
lfcTable.sd$sd.plot.logY[[3]])
gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
lfcTable.sd$sd.plot.logBoth[[2]],
lfcTable.sd$sd.plot.logBoth[[3]])
# Print the optimal presentation
gr.ar.fn2(lfcTable.sd$sd.plot.logBoth[[1]],
lfcTable.sd$sd.plot.logBoth[[2]],
lfcTable.sd$sd.plot[[3]])
# Add columns storing percent difference
lfcTable <- lfcTable %>%
# mDiff: difference in meanBase
# lDiff: difference in log2FoldChange
# fDiff: difference in padj (FDR)
mutate(mDiff_percent=100 * mean_SA_ST / (baseMean_Salmon + baseMean_STAR),
lDiff_percent=100 * lfc_SA_ST / (abs(log2FoldChange_Salmon) + abs(log2FoldChange_STAR)),
fDiff_percent=100 * FDR_SA_ST / (padj_Salmon + padj_STAR))
# Explore the output
head(lfcTable)
## GENEID baseMean_Salmon log2FoldChange_Salmon lfcSE_Salmon
## 1 ENSG00000001084 1587.7857 0.5853638 0.1350708
## 2 ENSG00000001497 685.4758 -0.3467204 0.1440032
## 3 ENSG00000001561 2240.0327 0.6073274 0.1311473
## 4 ENSG00000002746 53.1859 -1.0269975 0.3628482
## 5 ENSG00000002834 1075.5410 -0.6594280 0.1275030
## 6 ENSG00000003393 2980.8300 -0.3125891 0.1081754
## stat_Salmon pvalue_Salmon padj_Salmon FDR_Salmon Alignment_Salmon
## 1 4.333754 1.465878e-05 2.334551e-04 < 0.1 Salmon
## 2 -2.407727 1.605219e-02 7.910015e-02 < 0.1 Salmon
## 3 4.630878 3.641179e-06 6.871668e-05 < 0.1 Salmon
## 4 -2.830378 4.649307e-03 3.015041e-02 < 0.1 Salmon
## 5 -5.171864 2.317703e-07 6.121000e-06 < 0.1 Salmon
## 6 -2.889651 3.856695e-03 2.593355e-02 < 0.1 Salmon
## baseMean_STAR log2FoldChange_STAR lfcSE_STAR stat_STAR pvalue_STAR
## 1 1579.75977 0.5664292 0.1342064 4.220584 2.436705e-05
## 2 726.26711 -0.3417999 0.1371200 -2.492707 1.267735e-02
## 3 2273.37550 0.5989872 0.1297204 4.617524 3.883459e-06
## 4 58.72536 -1.0444676 0.3265005 -3.198977 1.379162e-03
## 5 932.70171 -0.6567112 0.1267828 -5.179814 2.221074e-07
## 6 3143.99551 -0.3221310 0.1063918 -3.027779 2.463580e-03
## padj_STAR FDR_STAR Alignment_STAR baseMean_HISAT2 log2FoldChange_HISAT2
## 1 3.973616e-04 < 0.1 STAR 1519.05864 0.5833814
## 2 6.833087e-02 < 0.1 STAR 687.30008 -0.3278163
## 3 7.878088e-05 < 0.1 STAR 2236.78012 0.5980731
## 4 1.189267e-02 < 0.1 STAR 57.00722 -1.0240761
## 5 6.212536e-06 < 0.1 STAR 895.53964 -0.6698121
## 6 1.904315e-02 < 0.1 STAR 3047.53624 -0.3097336
## lfcSE_HISAT2 stat_HISAT2 pvalue_HISAT2 padj_HISAT2 FDR_HISAT2
## 1 0.1317808 4.426908 9.559346e-06 1.804589e-04 < 0.1
## 2 0.1383529 -2.369421 1.781595e-02 9.157123e-02 < 0.1
## 3 0.1303310 4.588880 4.456302e-06 9.161883e-05 < 0.1
## 4 0.3277511 -3.124554 1.780749e-03 1.523395e-02 < 0.1
## 5 0.1266770 -5.287558 1.239603e-07 3.828322e-06 < 0.1
## 6 0.1062535 -2.915043 3.556398e-03 2.650504e-02 < 0.1
## Alignment_HISAT2 mean_SA_ST mean_SA_HI mean_ST_HI lfc_SA_ST lfc_SA_HI
## 1 HISAT2 8.025889 68.727024 60.70114 0.018934584 0.001982376
## 2 HISAT2 -40.791314 -1.824283 38.96703 -0.004920541 -0.018904180
## 3 HISAT2 -33.342767 3.252613 36.59538 0.008340129 0.009254245
## 4 HISAT2 -5.539464 -3.821324 1.71814 0.017470176 -0.002921357
## 5 HISAT2 142.839305 180.001381 37.16208 -0.002716743 0.010384170
## 6 HISAT2 -163.165542 -66.706268 96.45927 0.009541858 -0.002855531
## lfc_ST_HI FDR_SA_ST FDR_SA_HI FDR_ST_HI num.trans
## 1 -0.0169522076 -1.639066e-04 5.299613e-05 2.169027e-04 14
## 2 -0.0139836398 1.076927e-02 -1.247108e-02 -2.324035e-02 25
## 3 0.0009141155 -1.006420e-05 -2.290216e-05 -1.283796e-05 1
## 4 -0.0203915329 1.825774e-02 1.491646e-02 -3.341288e-03 11
## 5 0.0131009125 -9.153651e-08 2.292678e-06 2.384215e-06 9
## 6 -0.0123973888 6.890397e-03 -5.714961e-04 -7.461893e-03 75
## mDiff_percent lDiff_percent fDiff_percent
## 1 0.2533788 1.6439226 -25.9832352
## 2 -2.8894293 -0.7146543 7.3046171
## 3 -0.7387492 0.6913727 -6.8233015
## 4 -4.9498713 0.8433729 43.4262817
## 5 7.1126514 -0.2064176 -0.7421757
## 6 -2.6640031 1.5033175 15.3199245
dim(lfcTable)
## [1] 3590 38
# Create an empty data frame
lfcTable.percent <- data.frame()
for (x in my.param) {
init.vec <- c("GENEID", "num.trans")
# Set columns of interest
if (x == my.param[1]) {
col.of.interest <- c(init.vec, "mean_SA_ST", "mDiff_percent")
} else if (x == my.param[2]) {
col.of.interest <- c(init.vec, "lfc_SA_ST", "lDiff_percent")
} else {
col.of.interest <- c(init.vec, "FDR_SA_ST", "fDiff_percent")
}
# Trim the table
df <- lfcTable[, colnames(lfcTable) %in% col.of.interest] %>%
mutate(Metric=x) %>%
dplyr::rename(Difference=ends_with("ST"),
Percent=ends_with("percent"))
# Save in the empty data frame
if (x == my.param[1]) {
lfcTable.percent <- df
} else {
lfcTable.percent <- rbind(lfcTable.percent, df)
}
}
# Explore the output
head(lfcTable.percent)
## GENEID Difference num.trans Percent Metric
## 1 ENSG00000001084 8.025889 14 0.2533788 baseMean
## 2 ENSG00000001497 -40.791314 25 -2.8894293 baseMean
## 3 ENSG00000001561 -33.342767 1 -0.7387492 baseMean
## 4 ENSG00000002746 -5.539464 11 -4.9498713 baseMean
## 5 ENSG00000002834 142.839305 9 7.1126514 baseMean
## 6 ENSG00000003393 -163.165542 75 -2.6640031 baseMean
dim(lfcTable.percent)
## [1] 10770 5
# Nest the data frame by Metric
lfcTable.percent <- lfcTable.percent %>%
group_by(Metric) %>%
nest()
# Explore the output
lfcTable.percent
## # A tibble: 3 x 2
## # Groups: Metric [3]
## Metric data
## <chr> <list>
## 1 baseMean <tibble [3,590 × 4]>
## 2 log2FoldChange <tibble [3,590 × 4]>
## 3 padj <tibble [3,590 × 4]>
head(lfcTable.percent$data[[1]])
## # A tibble: 6 x 4
## GENEID Difference num.trans Percent
## <chr> <dbl> <int> <dbl>
## 1 ENSG00000001084 8.03 14 0.253
## 2 ENSG00000001497 -40.8 25 -2.89
## 3 ENSG00000001561 -33.3 1 -0.739
## 4 ENSG00000002746 -5.54 11 -4.95
## 5 ENSG00000002834 143. 9 7.11
## 6 ENSG00000003393 -163. 75 -2.66
# Calculate correlation (Rsquared)
# DvsP: Percent Difference vs Difference
# DvsT: Difference vs Number of Transcripts
# PvsT: Percent Difference vs Number of Transcripts
lfcTable.percent <- lfcTable.percent %>%
mutate(DvsP.mod=map(data, ~ lm(Percent ~ Difference, data=.x)),
DvsP.rsq=map_dbl(DvsP.mod, ~ summary(.x)$r.squared),
DvsT.mod=map(data, ~ lm(Difference ~ num.trans, data=.x)),
DvsT.rsq=map_dbl(DvsT.mod, ~ summary(.x)$r.squared),
PvsT.mod=map(data, ~ lm(Percent ~ num.trans, data=.x)),
PvsT.rsq=map_dbl(PvsT.mod, ~ summary(.x)$r.squared))
# Add Rsquard values to the data frame as plot label columns
DvsP.r <- plotlabels.fn(lfcTable.percent$DvsP.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
DvsT.r <- plotlabels.fn(lfcTable.percent$DvsT.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
PvsT.r <- plotlabels.fn(lfcTable.percent$PvsT.rsq,
lfcTable.percent$data,
nrow(lfcTable.percent))
lfcTable.percent <- lfcTable.percent %>%
unnest(data)
lfcTable.percent$DvsP.Rsquared <- DvsP.r
lfcTable.percent$DvsT.Rsquared <- DvsT.r
lfcTable.percent$PvsT.Rsquared <- PvsT.r
# Explore the output data frame
head(lfcTable.percent)
## # A tibble: 6 x 14
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent DvsP.mod DvsP.rsq DvsT.mod DvsT.rsq
## <chr> <chr> <dbl> <int> <dbl> <list> <dbl> <list> <dbl>
## 1 baseM… ENSG0… 8.03 14 0.253 <lm> 0.0396 <lm> 0.000269
## 2 baseM… ENSG0… -40.8 25 -2.89 <lm> 0.0396 <lm> 0.000269
## 3 baseM… ENSG0… -33.3 1 -0.739 <lm> 0.0396 <lm> 0.000269
## 4 baseM… ENSG0… -5.54 11 -4.95 <lm> 0.0396 <lm> 0.000269
## 5 baseM… ENSG0… 143. 9 7.11 <lm> 0.0396 <lm> 0.000269
## 6 baseM… ENSG0… -163. 75 -2.66 <lm> 0.0396 <lm> 0.000269
## # … with 5 more variables: PvsT.mod <list>, PvsT.rsq <dbl>,
## # DvsP.Rsquared <chr>, DvsT.Rsquared <chr>, PvsT.Rsquared <chr>
# Set a function creating scatter plot
scatter2.fn <- function(label.col, xtitle, ytitle, title, ylog=F, xlog=F, label=F) {
p <- ggplot(lfcTable.percent) +
theme_bw() +
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Metric, scales = "free") +
theme(strip.text.x=element_text(size=10)) +
ggtitle(paste(title, "in Salmon vs STAR")) +
xlab(xtitle) +
ylab(ytitle)
if (ylog) {
p <- p + scale_y_log10() + ylab(paste0("Log (", ytitle, ")"))
}
if (xlog) {
p <- p + scale_x_log10() + xlab(paste0("Log (", xtitle, ")"))
}
if (label) {
p <- p + geom_text(data=lfcTable.percent,
size=5,
mapping=aes(label=label.col, x=Inf, y=Inf),
vjust=2, hjust=1.1, color="black") +
ggtitle(paste(title, "in Salmon vs STAR", "(with R-Squared)"))
}
return(p)
}
# Create and save the plots
# P1 & 2: D vs P with or without log transformation of x values
# P3 & 4: P vs T with or without log transformation of y values
# P5 & 6: D vs T with or without log transformation of y values
p1 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared,
"Salmon - STAR",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Difference vs Percent Difference", label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))
p2 <- scatter2.fn(lfcTable.percent$DvsP.Rsquared,
"Salmon - STAR",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Difference vs Percent Difference",
xlog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=abs(Difference), y=abs(Percent), color=log(num.trans)))
p3 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared,
"Number of Alternative Transcripts",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Percent Difference vs Number of Alternative Transcripts",
label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Percent)))
p4 <- scatter2.fn(lfcTable.percent$PvsT.Rsquared,
"Number of Alternative Transcripts",
"100 x (Salmon - STAR) / (Salmon + STAR)",
"Percent Difference vs Number of Alternative Transcripts",
ylog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Percent)))
p5 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared,
"Number of Alternative Transcripts",
"Salmon - STAR",
"Difference vs Number of Alternative Transcripts",
label=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Difference)))
p6 <- scatter2.fn(lfcTable.percent$DvsT.Rsquared,
"Number of Alternative Transcripts",
"Salmon - STAR",
"Difference vs Number of Alternative Transcripts",
ylog=T) +
geom_point(data=lfcTable.percent, alpha=0.5,
aes(x=num.trans, y=abs(Difference)))
# Print the plots
grid.arrange(p1, p2, ncol=1)
grid.arrange(p3, p4, ncol=1)
grid.arrange(p5, p6, ncol=1)
# Set a function rearranging a data frame
rank.fn <- function(df, var, desc=F) {
if (desc) {
df <- dplyr::arrange(df, desc(var))
} else {
df <- dplyr::arrange(df, var)
}
return(df)
}
# Set a function creating a rank plot
rank.plot.fn <- function(df, metric, colog=F) {
if (colog) {
p <- ggplot(df, aes(x=Rank.x,
y=Rank.y,
color=log(MeanValue),
label=Rsquared))
} else {
p <- ggplot(df, aes(x=Rank.x,
y=Rank.y,
color=MeanValue,
label=Rsquared))
}
p <- p +
geom_point(alpha=0.5) +
theme_bw() +
scale_color_gradient(low="blue", high="red") +
xlab("Salmon") +
ylab("STAR") +
ggtitle(paste("Rank Comparison in", metric, "\n(with R-Squared)")) + geom_text(size=5,
mapping=aes(x=Inf, y=Inf), vjust=2, hjust=1.0, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black")
return(p)
}
# Clean the data frame
lfcTable.rank <- lfcTable %>%
# Splice by columns of interest
dplyr::select(GENEID,
starts_with(c("baseMean_", "log2FoldChange_", "padj_")),
-ends_with("HISAT2")) %>%
# Reform the table
gather(Metric, Value, -GENEID) %>%
# Nest by Metric
group_by(Metric) %>%
nest() %>%
# Add columns storing ascending or descending order by variable of interest
mutate(Ascending=map(data, ~ rank.fn(.x, .x$Value)),
Descending=map(data, ~ rank.fn(.x, .x$Value, T)),
Final=map2(Ascending,
Descending,
~ ifelse(Metric %in% c("padj_Salmon", "padj_STAR"),
Ascending,
Descending)),
Final=map(Final, ~ .x[[1]]),
# Add a column storing rank
Final=map(Final, ~ mutate(.x, Rank=1:nrow(.x)))) %>%
# Unnest
unnest(Final) %>%
# Reform the table
separate(Metric, c("Metric", "Aligner")) %>%
dplyr::select(-data, -Ascending, -Descending) %>%
# Renest by metric
group_by(Metric) %>%
nest() %>%
# Reclean the data frame
mutate(Split=map(data, ~ split(.x, .x$Aligner)),
Joined=map(Split, ~ inner_join(.x[[1]],
.x[[2]],
by=c("GENEID")))) %>%
dplyr::select(-data, -Split) %>%
# Unnest
unnest(Joined) %>%
# Calculate and save MeanRank and MeanValue
mutate(MeanRank=(Rank.x + Rank.y)/2,
MeanValue=abs((Value.x + Value.y)/2)) %>%
# Nest by Metric
nest(-Metric) %>%
# Add a column storing R^2
mutate(Rsquared=map_dbl(data, ~ cor(.x$Rank.x, .x$Rank.y)))
# Explore the data frame
head(lfcTable.rank)
## # A tibble: 3 x 3
## # Groups: Metric [3]
## Metric data Rsquared
## <chr> <list> <dbl>
## 1 baseMean <tibble [3,590 × 9]> 0.984
## 2 log2FoldChange <tibble [3,590 × 9]> 0.997
## 3 padj <tibble [3,590 × 9]> 0.945
head(lfcTable.rank$data)
## [[1]]
## # A tibble: 3,590 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
## 1 Salmon ENSG000… 1.41e6 1 STAR 1.61e6 1 1 1512960.
## 2 Salmon ENSG000… 4.30e5 2 STAR 4.50e5 2 2 440247.
## 3 Salmon ENSG000… 2.72e5 3 STAR 2.81e5 3 3 276488.
## 4 Salmon ENSG000… 2.03e5 4 STAR 2.16e5 4 4 209624.
## 5 Salmon ENSG000… 1.09e5 5 STAR 1.17e5 6 5.5 113195.
## 6 Salmon ENSG000… 7.08e4 6 STAR 7.37e4 8 7 72243.
## 7 Salmon ENSG000… 7.02e4 7 STAR 7.29e4 9 8 71539.
## 8 Salmon ENSG000… 6.07e4 8 STAR 6.30e4 10 9 61829.
## 9 Salmon ENSG000… 5.08e4 9 STAR 4.41e4 16 12.5 47463.
## 10 Salmon ENSG000… 4.94e4 10 STAR 5.07e4 11 10.5 50018.
## # … with 3,580 more rows
##
## [[2]]
## # A tibble: 3,590 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank MeanValue
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
## 1 Salmon ENSG000… 3.83 1 STAR 1.39 156 78.5 2.61
## 2 Salmon ENSG000… 3.76 2 STAR 3.58 3 2.5 3.67
## 3 Salmon ENSG000… 3.17 3 STAR 3.66 1 2 3.42
## 4 Salmon ENSG000… 3.16 4 STAR 2.85 8 6 3.00
## 5 Salmon ENSG000… 3.10 5 STAR 3.16 5 5 3.13
## 6 Salmon ENSG000… 3.07 6 STAR 3.19 4 5 3.13
## 7 Salmon ENSG000… 3.04 7 STAR 2.87 7 7 2.96
## 8 Salmon ENSG000… 2.98 8 STAR 3.59 2 5 3.28
## 9 Salmon ENSG000… 2.80 9 STAR 2.51 13 11 2.65
## 10 Salmon ENSG000… 2.76 10 STAR 2.66 10 10 2.71
## # … with 3,580 more rows
##
## [[3]]
## # A tibble: 3,590 x 9
## Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank
## <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl>
## 1 Salmon ENSG000002610… 2.01e-117 1 STAR 6.48e-117 1 1
## 2 Salmon ENSG000001699… 5.84e-114 2 STAR 3.41e-114 2 2
## 3 Salmon ENSG000001185… 1.55e- 77 3 STAR 1.92e- 75 3 3
## 4 Salmon ENSG000001369… 8.64e- 68 4 STAR 3.74e- 67 4 4
## 5 Salmon ENSG000001811… 9.29e- 60 5 STAR 3.00e- 60 6 5.5
## 6 Salmon ENSG000001092… 2.52e- 59 6 STAR 4.64e- 67 5 5.5
## 7 Salmon ENSG000002114… 2.33e- 57 7 STAR 5.70e- 57 7 7
## 8 Salmon ENSG000001736… 2.14e- 55 8 STAR 5.19e- 33 27 17.5
## 9 Salmon ENSG000001387… 2.22e- 54 9 STAR 4.17e- 55 8 8.5
## 10 Salmon ENSG000000384… 3.60e- 53 10 STAR 6.36e- 53 10 10
## # … with 3,580 more rows, and 1 more variable: MeanValue <dbl>
# Create a vector storing R^2 values as an input of ggplot2()
Rsquared <- plotlabels.fn(lfcTable.rank$Rsquared,
lfcTable.rank$data,
nrow(lfcTable.rank))
# Unnest and add a column storing ggplot2's input R^2
lfcTable.rank <- lfcTable.rank %>%
unnest(data)
lfcTable.rank$Rsquared <- Rsquared
# Check out the new column added
head(lfcTable.rank)
## # A tibble: 6 x 11
## # Groups: Metric [1]
## Metric Aligner.x GENEID Value.x Rank.x Aligner.y Value.y Rank.y MeanRank
## <chr> <chr> <chr> <dbl> <int> <chr> <dbl> <int> <dbl>
## 1 baseMe… Salmon ENSG000001… 1.41e6 1 STAR 1.61e6 1 1
## 2 baseMe… Salmon ENSG000002… 4.30e5 2 STAR 4.50e5 2 2
## 3 baseMe… Salmon ENSG000001… 2.72e5 3 STAR 2.81e5 3 3
## 4 baseMe… Salmon ENSG000002… 2.03e5 4 STAR 2.16e5 4 4
## 5 baseMe… Salmon ENSG000001… 1.09e5 5 STAR 1.17e5 6 5.5
## 6 baseMe… Salmon ENSG000001… 7.08e4 6 STAR 7.37e4 8 7
## # … with 2 more variables: MeanValue <dbl>, Rsquared <chr>
# Clean the data frame
lfcTable.rank <- lfcTable.rank %>%
# Nest by metric
group_by(Metric) %>%
nest() %>%
# Add columns storing scatter plots
mutate(RankPlot=map(data, ~ rank.plot.fn(.x, Metric)),
RankPlot.log=map(data, ~ rank.plot.fn(.x, Metric, T)))
# Set a function simplifying grid.arrange()
gr.ar.fn3 <- function(pl1, pl2) {
grid.arrange(pl1, pl2, nrow=1)
}
# Explore the plots
gr.ar.fn3(lfcTable.rank$RankPlot[[1]],
lfcTable.rank$RankPlot.log[[1]])
gr.ar.fn3(lfcTable.rank$RankPlot[[2]],
lfcTable.rank$RankPlot.log[[2]])
gr.ar.fn3(lfcTable.rank$RankPlot[[3]],
lfcTable.rank$RankPlot.log[[3]])
# Complete the optimal presentation for the plots
gr.ar.fn2(lfcTable.rank$RankPlot.log[[1]],
lfcTable.rank$RankPlot.log[[2]],
lfcTable.rank$RankPlot[[3]])
for (x in my.param) {
# Subset by metric
df <- subset(lfcTable.percent, Metric == x)[, 1:5]
# Determine whether ascending or descending order is needed
if (x == "padj") {
# Descending for padj
ordering <- F
} else {
# Ascending otherwise
ordering <- T
}
# Rearrange the data frame using the function rank.fn() set previously
df <- rank.fn(df, abs(df$Percent), desc=ordering)
# Print the output data frame
print(head(df))
# Save as a csv file
write.csv(df,
paste0(x, "_difference_SA_ST.csv"))
}
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 baseMean ENSG00000233476 -111827. 1 -99.7
## 2 baseMean ENSG00000196205 -128658. 1 -99.6
## 3 baseMean ENSG00000214391 -6445. 1 -98.8
## 4 baseMean ENSG00000196136 1476. 9 98.1
## 5 baseMean ENSG00000010404 5438. 8 94.9
## 6 baseMean ENSG00000167553 -21628. 10 -92.1
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 log2FoldChange ENSG00000254634 1.21 1 61.7
## 2 log2FoldChange ENSG00000254206 1.17 2 60.3
## 3 log2FoldChange ENSG00000244722 -1.54 1 -52.7
## 4 log2FoldChange ENSG00000279750 2.44 2 46.7
## 5 log2FoldChange ENSG00000196205 0.539 1 46.1
## 6 log2FoldChange ENSG00000260518 -0.788 4 -43.9
## # A tibble: 6 x 5
## # Groups: Metric [1]
## Metric GENEID Difference num.trans Percent
## <chr> <chr> <dbl> <int> <dbl>
## 1 padj ENSG00000113569 -0.000000577 8 -0.000670
## 2 padj ENSG00000185942 0.0000000115 12 0.00189
## 3 padj ENSG00000144028 -0.00000682 9 -0.00667
## 4 padj ENSG00000071967 -0.000000421 7 -0.00711
## 5 padj ENSG00000024862 -0.0000000466 1 -0.00789
## 6 padj ENSG00000277443 -0.00000116 1 -0.0118
lfcTable.csv <- lfcTable %>%
dplyr::select(GENEID, starts_with(my.param[-1])) %>%
gather(Category, Value, -GENEID) %>%
separate(Category, c("Metric", "Aligner")) %>%
pivot_wider(names_from=Metric, values_from=Value)
write.csv(lfcTable.csv, "lfcTable.csv")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] gridExtra_2.3 ensembldb_2.14.0
## [3] AnnotationFilter_1.14.0 GenomicFeatures_1.42.1
## [5] AnnotationDbi_1.52.0 UpSetR_1.4.0
## [7] DESeq2_1.30.1 SummarizedExperiment_1.20.0
## [9] Biobase_2.50.0 MatrixGenerics_1.2.0
## [11] matrixStats_0.58.0 GenomicRanges_1.42.0
## [13] GenomeInfoDb_1.26.0 IRanges_2.24.1
## [15] S4Vectors_0.28.1 Rsubread_2.4.0
## [17] tximport_1.18.0 AnnotationHub_2.22.0
## [19] BiocFileCache_1.14.0 dbplyr_2.1.0
## [21] BiocGenerics_0.36.0 pheatmap_1.0.12
## [23] rmarkdown_2.7 forcats_0.5.1
## [25] stringr_1.4.0 dplyr_1.0.5
## [27] purrr_0.3.4 readr_1.4.0
## [29] tidyr_1.1.3 tibble_3.1.0
## [31] ggplot2_3.3.3 tidyverse_1.3.0
## [33] data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] XVector_0.30.0 fs_1.5.0
## [5] rstudioapi_0.13 farver_2.0.3
## [7] bit64_4.0.5 interactiveDisplayBase_1.28.0
## [9] fansi_0.4.2 lubridate_1.7.10
## [11] xml2_1.3.2 splines_4.0.3
## [13] cachem_1.0.4 geneplotter_1.68.0
## [15] knitr_1.31 jsonlite_1.7.2
## [17] Rsamtools_2.6.0 broom_0.7.5
## [19] annotate_1.68.0 shiny_1.6.0
## [21] BiocManager_1.30.10 compiler_4.0.3
## [23] httr_1.4.2 backports_1.2.1
## [25] lazyeval_0.2.2 assertthat_0.2.1
## [27] Matrix_1.3-2 fastmap_1.1.0
## [29] cli_2.3.1 later_1.1.0.1
## [31] prettyunits_1.1.1 htmltools_0.5.1.1
## [33] tools_4.0.3 gtable_0.3.0
## [35] glue_1.4.2 GenomeInfoDbData_1.2.4
## [37] rappdirs_0.3.3 Rcpp_1.0.6
## [39] cellranger_1.1.0 jquerylib_0.1.3
## [41] Biostrings_2.58.0 vctrs_0.3.6
## [43] rtracklayer_1.50.0 xfun_0.21
## [45] ps_1.5.0 rvest_0.3.6
## [47] mime_0.10 lifecycle_1.0.0
## [49] XML_3.99-0.5 zlibbioc_1.36.0
## [51] scales_1.1.1 ProtGenerics_1.22.0
## [53] hms_1.0.0 promises_1.2.0.1
## [55] RColorBrewer_1.1-2 yaml_2.2.1
## [57] curl_4.3 memoise_2.0.0
## [59] sass_0.3.1 biomaRt_2.46.3
## [61] stringi_1.5.3 RSQLite_2.2.3
## [63] highr_0.8 BiocVersion_3.12.0
## [65] genefilter_1.72.1 BiocParallel_1.24.0
## [67] rlang_0.4.10 pkgconfig_2.0.3
## [69] bitops_1.0-6 evaluate_0.14
## [71] lattice_0.20-41 labeling_0.4.2
## [73] GenomicAlignments_1.26.0 bit_4.0.4
## [75] tidyselect_1.1.0 plyr_1.8.6
## [77] magrittr_2.0.1 R6_2.5.0
## [79] generics_0.1.0 DelayedArray_0.16.0
## [81] DBI_1.1.1 pillar_1.5.0
## [83] haven_2.3.1 withr_2.4.1
## [85] survival_3.2-7 RCurl_1.98-1.2
## [87] modelr_0.1.8 crayon_1.4.1
## [89] utf8_1.1.4 progress_1.2.2
## [91] locfit_1.5-9.4 grid_4.0.3
## [93] readxl_1.3.1 blob_1.2.1
## [95] reprex_1.0.0 digest_0.6.27
## [97] xtable_1.8-4 httpuv_1.5.5
## [99] openssl_1.4.3 munsell_0.5.0
## [101] bslib_0.2.4 askpass_1.1